Note: In this script, I test tothosp (the total number of hospitalized days). I try to control for hospitalization unrelated to infelunza by excluding patients with underlying conditions from the tothosp analysis

Notes on data formatting from Deborah Wentworth:

Remove cases with underlying symptoms

(Attempt to control for hospitalization days attributable to other conditions than flu)

dat.003 = dat.003[dat.003$anydx != 1, ]

Visualize the relationship between age and duration of hospitalization

## 
##  Spearman's rank correlation rho
## 
## data:  valid$age and valid$tothospdays
## S = 984130, p-value = 1.948e-06
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.3245169

There is a positive correlation between age and duration of hospitalization.

Visualize the relationship between age and prob of H3N2 infection

## 
##  Spearman's rank correlation rho
## 
## data:  valid$age and valid$tothospdays
## S = 202550, p-value = 0.002142
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2787536

Results show a positive correlation between duration of hosp and age.

Visualize the relationship between country and symptom duration

H1N1

H3N2

Some minor effects of country

Visualize the relationship between season and symptom duration

H1N1

No clear effect of season.

H3N2

No clear effect of season –> Try models that don’t include season and country as blocking vars

H1N1 10-fold CV

We know the relationship between age and probability of infection should be non-linear. Use cross-validation to verify, and to choose the number of degrees of freedom to include in a spline

Plot the overall test error rate vs. df

Use linear fit.

H3N2 10-fold CV

We know the relationship between age and probability of infection should be non-linear. Use cross-validation to verify, and to choose the number of degrees of freedom to include in a spline

Plot the overall test error rate vs. df

Conclusion: Use 2 df

Set up a logistic regression that includes the following predictors:

  • age
  • imprinting status
  • vaccination status
  • antiviral use
  • season
  • country

Clean data for fitting

## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## 
## Argentina Australia   Austria   Belgium     China   Denmark   Germany 
##         8        15         2         3         5        26         8 
##    Greece    Norway      Peru    Poland Singapore     Spain  Thailand 
##        28         1         7         4         1        14        32 
##        UK       USA 
##        37         7
## [1] 198

First fit a model where the only independent variable is age

Treat vaccination, antivial use, underlying symptoms, country and season as blocking variables with fixed effects

library(splines)
## H1N1 fit to medical history only (no effect of country, season, age or imprinting)
fit00 = glm(tothospdays ~ anyvac + anyav , data = train.H1N1)
summary(fit00)
## 
## Call:
## glm(formula = tothospdays ~ anyvac + anyav, data = train.H1N1)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -8.592  -5.592  -4.336  -0.592  73.408  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.536      1.921   3.402 0.000829 ***
## anyvac1       -2.457      3.079  -0.798 0.426007    
## anyav1         2.056      2.149   0.957 0.340003    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 138.0064)
## 
##     Null deviance: 24361  on 177  degrees of freedom
## Residual deviance: 24151  on 175  degrees of freedom
## AIC: 1387.2
## 
## Number of Fisher Scoring iterations: 2
## H1N1 fit to country, season and medical history only (no effect of age or imprinting)  
fit0 = glm(tothospdays ~ anyvac + anyav  + country + season, data = train.H1N1)
summary(fit0)
## 
## Call:
## glm(formula = tothospdays ~ anyvac + anyav + country + season, 
##     data = train.H1N1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -30.268   -5.195   -1.317    1.871   52.581  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.2323     6.3885   0.193 0.847313    
## anyvac1           -2.1808     3.1186  -0.699 0.485473    
## anyav1             4.1029     2.2118   1.855 0.065600 .  
## countryAustralia  -7.3290     5.2674  -1.391 0.166208    
## countryAustria    15.7163     9.7232   1.616 0.108161    
## countryBelgium    -1.1329     8.6900  -0.130 0.896455    
## countryChina       1.9765     8.1844   0.241 0.809512    
## countryDenmark    -0.9244     6.0594  -0.153 0.878956    
## countryGermany     5.3378     6.8884   0.775 0.439642    
## countryGreece     -0.1969     6.0284  -0.033 0.973993    
## countryNorway     -0.3352    12.2766  -0.027 0.978253    
## countryPeru       11.5772     7.4646   1.551 0.123065    
## countryPoland     -3.6089     8.0148  -0.450 0.653172    
## countrySingapore  -8.5219    14.1011  -0.604 0.546549    
## countrySpain       4.0535     6.5270   0.621 0.535536    
## countryThailand   -7.5219     5.1082  -1.473 0.143019    
## countryUK          1.7726     5.8337   0.304 0.761674    
## countryUSA         2.5464     7.1331   0.357 0.721622    
## seasonNH.10.11     4.5989     3.3854   1.358 0.176404    
## seasonNH.11.12    -1.7786    12.0092  -0.148 0.882462    
## seasonNH.12.13    25.9567     7.0358   3.689 0.000316 ***
## seasonNH.13.14     4.3108     3.3393   1.291 0.198750    
## seasonNH.14.15    25.2416     6.1767   4.087 7.17e-05 ***
## seasonNH.15.16     2.3157     2.8054   0.825 0.410456    
## seasonNH.16.17     5.1867     8.5055   0.610 0.542935    
## seasonSH.10        6.6867     8.5055   0.786 0.433041    
## seasonSH.11        6.5842     9.2856   0.709 0.479402    
## seasonSH.13        9.9673     6.6713   1.494 0.137306    
## seasonSH.14       19.8546     7.3322   2.708 0.007575 ** 
## seasonSH.16        5.3112     4.3697   1.215 0.226142    
## seasonSH.17        4.1867    11.3985   0.367 0.713924    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 115.1654)
## 
##     Null deviance: 24361  on 177  degrees of freedom
## Residual deviance: 16929  on 147  degrees of freedom
## AIC: 1379.9
## 
## Number of Fisher Scoring iterations: 2
## H1N1 fit to age only, plus fixed effects of vaccination, av use, underlying conditions, country and season
fit1 = glm(tothospdays ~ ns(age, knots = 65) + anyvac + anyav  + country + season, data = train.H1N1)
summary(fit1)
## 
## Call:
## glm(formula = tothospdays ~ ns(age, knots = 65) + anyvac + anyav + 
##     country + season, data = train.H1N1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -30.137   -4.827   -1.107    2.042   53.322  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.78506    6.81294  -0.409 0.683297    
## ns(age, knots = 65)1 10.66178    5.86569   1.818 0.071182 .  
## ns(age, knots = 65)2  4.73289    7.35609   0.643 0.520983    
## anyvac1              -1.92200    3.12520  -0.615 0.539517    
## anyav1                4.20919    2.21731   1.898 0.059639 .  
## countryAustralia     -5.56228    5.38307  -1.033 0.303188    
## countryAustria       16.77486    9.69580   1.730 0.085737 .  
## countryBelgium        1.36901    8.75488   0.156 0.875958    
## countryChina          2.59938    8.15419   0.319 0.750354    
## countryDenmark        0.30909    6.08108   0.051 0.959532    
## countryGermany        7.25647    6.93447   1.046 0.297102    
## countryGreece        -0.05434    6.00439  -0.009 0.992792    
## countryNorway         1.08135   12.25612   0.088 0.929816    
## countryPeru          11.06801    7.48680   1.478 0.141487    
## countryPoland        -1.17634    8.08565  -0.145 0.884530    
## countrySingapore     -8.70823   14.15569  -0.615 0.539403    
## countrySpain          4.51538    6.53235   0.691 0.490524    
## countryThailand      -6.52388    5.12924  -1.272 0.205445    
## countryUK             3.12786    5.85923   0.534 0.594273    
## countryUSA            3.18720    7.14145   0.446 0.656049    
## seasonNH.10.11        4.30570    3.37332   1.276 0.203855    
## seasonNH.11.12        0.66725   12.03208   0.055 0.955852    
## seasonNH.12.13       25.14195    7.05756   3.562 0.000498 ***
## seasonNH.13.14        4.53638    3.32727   1.363 0.174872    
## seasonNH.14.15       25.09532    6.15516   4.077 7.49e-05 ***
## seasonNH.15.16        1.85558    2.80403   0.662 0.509179    
## seasonNH.16.17        5.19523    8.50961   0.611 0.542477    
## seasonSH.10           8.01524    8.50943   0.942 0.347798    
## seasonSH.11           7.20703    9.28035   0.777 0.438666    
## seasonSH.13          10.66405    6.72724   1.585 0.115098    
## seasonSH.14          20.31012    7.42238   2.736 0.006990 ** 
## seasonSH.16           4.91236    4.35758   1.127 0.261471    
## seasonSH.17           4.35156   11.34880   0.383 0.701957    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 114.0787)
## 
##     Null deviance: 24361  on 177  degrees of freedom
## Residual deviance: 16541  on 145  degrees of freedom
## AIC: 1379.8
## 
## Number of Fisher Scoring iterations: 2
## H1N1 fit to age plus imprinting, plus fixed effects of vaccination, av use, underlying conditions, country and season
fit2 = glm(tothospdays ~ ns(age, knots = 65) + p.g1.protection + anyvac + anyav  + country + season, data = train.H1N1)
summary(fit2)
## 
## Call:
## glm(formula = tothospdays ~ ns(age, knots = 65) + p.g1.protection + 
##     anyvac + anyav + country + season, data = train.H1N1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -30.133   -4.843   -1.129    2.020   53.336  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -2.76723    6.86253  -0.403 0.687372    
## ns(age, knots = 65)1 10.80940    7.68503   1.407 0.161714    
## ns(age, knots = 65)2  4.85373    8.41711   0.577 0.565076    
## p.g1.protection      -0.09706    3.24884  -0.030 0.976207    
## anyvac1              -1.91654    3.14136  -0.610 0.542758    
## anyav1                4.20237    2.23669   1.879 0.062289 .  
## countryAustralia     -5.57467    5.41762  -1.029 0.305210    
## countryAustria       16.78888    9.74068   1.724 0.086929 .  
## countryBelgium        1.36612    8.78573   0.155 0.876651    
## countryChina          2.62361    8.22254   0.319 0.750132    
## countryDenmark        0.30929    6.10215   0.051 0.959647    
## countryGermany        7.24344    6.97214   1.039 0.300587    
## countryGreece        -0.04880    6.02803  -0.008 0.993551    
## countryNorway         1.05042   12.34209   0.085 0.932293    
## countryPeru          11.05330    7.52884   1.468 0.144251    
## countryPoland        -1.17135    8.11537  -0.144 0.885436    
## countrySingapore     -8.76454   14.32923  -0.612 0.541730    
## countrySpain          4.51935    6.55631   0.689 0.491736    
## countryThailand      -6.53670    5.16485  -1.266 0.207696    
## countryUK             3.13385    5.88294   0.533 0.595061    
## countryUSA            3.18101    7.16918   0.444 0.657921    
## seasonNH.10.11        4.31268    3.39306   1.271 0.205768    
## seasonNH.11.12        0.70217   12.13018   0.058 0.953920    
## seasonNH.12.13       25.14070    7.08212   3.550 0.000521 ***
## seasonNH.13.14        4.53546    3.33894   1.358 0.176475    
## seasonNH.14.15       25.08660    6.18337   4.057 8.11e-05 ***
## seasonNH.15.16        1.84625    2.83102   0.652 0.515344    
## seasonNH.16.17        5.22616    8.60160   0.608 0.544423    
## seasonSH.10           8.01965    8.54018   0.939 0.349277    
## seasonSH.11           7.23035    9.34515   0.774 0.440376    
## seasonSH.13          10.68226    6.77801   1.576 0.117216    
## seasonSH.14          20.32178    7.45829   2.725 0.007233 ** 
## seasonSH.16           4.91872    4.37785   1.124 0.263074    
## seasonSH.17           4.36440   11.39620   0.383 0.702307    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 114.8702)
## 
##     Null deviance: 24361  on 177  degrees of freedom
## Residual deviance: 16541  on 144  degrees of freedom
## AIC: 1381.8
## 
## Number of Fisher Scoring iterations: 2
## H1N1 fit to age only, plus fixed effects of vaccination, av use, underlying conditions NO COUNTRY AND SEASON
fit3 = glm(tothospdays ~ ns(age, knots = 65) + anyvac + anyav , data = train.H1N1)
summary(fit3)
## 
## Call:
## glm(formula = tothospdays ~ ns(age, knots = 65) + anyvac + anyav, 
##     data = train.H1N1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -10.998   -5.281   -3.344    0.200   72.905  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)             3.400      2.786   1.220    0.224  
## ns(age, knots = 65)1   11.680      5.645   2.069    0.040 *
## ns(age, knots = 65)2    5.517      7.372   0.748    0.455  
## anyvac1                -1.930      3.107  -0.621    0.535  
## anyav1                  1.828      2.154   0.848    0.397  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 136.1311)
## 
##     Null deviance: 24361  on 177  degrees of freedom
## Residual deviance: 23551  on 173  degrees of freedom
## AIC: 1386.7
## 
## Number of Fisher Scoring iterations: 2
## H1N1 fit to age plus imprinting, plus fixed effects of vaccination, av use, underlying conditions, NO COUNTRY AND SEASON
fit4 = glm(tothospdays ~ ns(age, knots = 65) + p.g1.protection + anyvac + anyav, data = train.H1N1)
summary(fit4)
## 
## Call:
## glm(formula = tothospdays ~ ns(age, knots = 65) + p.g1.protection + 
##     anyvac + anyav, data = train.H1N1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -10.407   -5.244   -3.438    0.109   73.618  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)
## (Intercept)             3.083      2.869   1.075    0.284
## ns(age, knots = 65)1    9.323      7.483   1.246    0.214
## ns(age, knots = 65)2    3.435      8.561   0.401    0.689
## p.g1.protection         1.571      3.263   0.481    0.631
## anyvac1                -1.991      3.116  -0.639    0.524
## anyav1                  1.943      2.172   0.894    0.372
## 
## (Dispersion parameter for gaussian family taken to be 136.7383)
## 
##     Null deviance: 24361  on 177  degrees of freedom
## Residual deviance: 23519  on 172  degrees of freedom
## AIC: 1388.5
## 
## Number of Fisher Scoring iterations: 2
## Plot the fits for each model side by side
par(mfrow = c(1, 2), las = 3, mar = c(6, 6, 6, 3), cex.axis = .7)
library(gam)
## Loading required package: foreach
## Loaded gam 1.14-4
plot.gam(fit00, se = T, col = 'dodgerblue', main = 'Simplified Baseline')

plot.gam(fit0, se = T, col = 'dodgerblue', main = 'Baseline')

plot.gam(fit1, se = T, col = 'dodgerblue', main = 'Baseline + age + imp')

plot.gam(fit2, se = T, col = 'dodgerblue', main = 'Baseline + age + imp')

plot.gam(fit3, se = T, col = 'dodgerblue', main = 'Simple baseline + age + imp')

plot.gam(fit4, se = T, col = 'dodgerblue', main = 'Simple baseline + age + imp')

pdf('003Tothospdays_H1N1.pdf')
plot.gam(fit4, se = T, col = 'dodgerblue', main = 'Simple baseline + age + imp')
dev.off()
## quartz_off_screen 
##                 2
# Predict probs for each patient
pred00 = predict(fit00, newdata = test.H1N1)
pred3 = predict(fit3, newdata = test.H1N1)
pred4 = predict(fit4, newdata = test.H1N1)
  
# Estimate the simulated error rate
MSE = numeric(6); names(MSE) = c('simple.baseline', 'simple.baseline+age', 'simple.baseline+age+imp')
MSE[1] = mean((pred00 - test.H1N1$tothospdays)^2)
MSE[2] = mean((pred3 - test.H1N1$tothospdays)^2)
MSE[3] = mean((pred4 - test.H1N1$tothospdays)^2)
sort(MSE)
##                    <NA>                    <NA>                    <NA> 
##                 0.00000                 0.00000                 0.00000 
## simple.baseline+age+imp     simple.baseline+age         simple.baseline 
##                77.55715                77.87408                80.51232
AIC = c(fit00$aic, fit0$aic, fit1$aic, fit2$aic, fit3$aic, fit4$aic); names(AIC) =  c('simple.baseline', 'baseline', 'baeline+age', 'baseline+age+imp', 'simple.baseline+age', 'simple.baseline+age+imp')
del.AIC = sort(AIC - min(AIC))
del.AIC
##             baeline+age                baseline        baseline+age+imp 
##               0.0000000               0.1259588               1.9988967 
##     simple.baseline+age         simple.baseline simple.baseline+age+imp 
##               6.8851472               7.3664140               8.6455012
load('master.AIC.table.H1N1.RData')
master.AIC.table['003Tothospdays',  c('simple.baseline', 'baseline', 'baseline+age', 'baseline+age+imp', 'simple.baseline+age', 'simple.baseline+age+imp')] = AIC - min(AIC)
save(master.AIC.table, file = 'master.AIC.table.H1N1.RData')
rm(master.AIC.table)

Clean H3N2 data for fitting

## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## integer(0)
## 
## Argentina Australia   Belgium     China   Denmark   Germany      Peru 
##         7        14         1         2         8         2         4 
## Singapore     Spain  Thailand        UK       USA 
##         1         2        50        12        12
## [1] 115
library(splines)
## H3N2 fit to medical history only (no effect of country, season, age or imprinting)
fit00 = glm(tothospdays ~ anyvac + anyav, data = train.H3N2)
summary(fit00)
## 
## Call:
## glm(formula = tothospdays ~ anyvac + anyav, data = train.H3N2)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -7.918  -2.494  -1.494  -0.494  59.506  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.9183     1.9899   3.979 0.000133 ***
## anyvac1      -0.2311     2.0335  -0.114 0.909754    
## anyav1       -3.4247     2.1310  -1.607 0.111287    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 61.02857)
## 
##     Null deviance: 6078.0  on 99  degrees of freedom
## Residual deviance: 5919.8  on 97  degrees of freedom
## AIC: 699.88
## 
## Number of Fisher Scoring iterations: 2
## H3N2 fit to country, season and medical history only (no effect of age or imprinting)  
fit0 = glm(tothospdays ~ anyvac + anyav + country + season, data = train.H3N2)
summary(fit0)
## 
## Call:
## glm(formula = tothospdays ~ anyvac + anyav + country + season, 
##     data = train.H3N2)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -23.1545   -2.1049   -0.3286    1.1937   23.2288  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10.5040     5.8944   1.782  0.07869 .  
## anyvac1            0.2181     2.0477   0.106  0.91547    
## anyav1            -2.8636     2.1853  -1.310  0.19395    
## countryAustralia -15.1388     4.8277  -3.136  0.00243 ** 
## countryDenmark    -8.3283     4.7897  -1.739  0.08607 .  
## countryOther      -1.4478     4.7126  -0.307  0.75950    
## countryPeru       -2.5050     5.4054  -0.463  0.64437    
## countryThailand   -9.9565     3.9042  -2.550  0.01275 *  
## countryUK         -9.6325     4.7011  -2.049  0.04387 *  
## countryUSA        -6.0500     4.9102  -1.232  0.22165    
## seasonNH.12.13     1.9628     4.6814   0.419  0.67619    
## seasonNH.13.14     3.3207     5.3483   0.621  0.53651    
## seasonNH.14.15     3.1407     4.6702   0.672  0.50328    
## seasonNH.15.16     5.3943     5.1282   1.052  0.29614    
## seasonNH.16.17     6.1773     4.9334   1.252  0.21431    
## seasonSH.10       34.6529     6.4265   5.392 7.39e-07 ***
## seasonSH.11        1.3596     7.1632   0.190  0.84996    
## seasonSH.12       12.4984     8.4172   1.485  0.14167    
## seasonSH.13        6.8796     6.0663   1.134  0.26028    
## seasonSH.14       -2.1349     4.9166  -0.434  0.66534    
## seasonSH.15        4.9001     5.0081   0.978  0.33093    
## seasonSH.16       10.2498     5.4268   1.889  0.06269 .  
## seasonSH.17       10.5085     5.4685   1.922  0.05835 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 40.55114)
## 
##     Null deviance: 6078.0  on 99  degrees of freedom
## Residual deviance: 3122.4  on 77  degrees of freedom
## AIC: 675.91
## 
## Number of Fisher Scoring iterations: 2
## H3N2 fit to age only, plus fixed effects of vaccination, av use, underlying conditions, country and season
fit1 = glm(tothospdays ~ ns(age, knots = 65) + anyvac + anyav + country + season, data = train.H3N2)
summary(fit1)
## 
## Call:
## glm(formula = tothospdays ~ ns(age, knots = 65) + anyvac + anyav + 
##     country + season, data = train.H3N2)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -19.9770   -2.5916   -0.1418    1.7009   20.8286  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           13.6056     5.8599   2.322  0.02296 *  
## ns(age, knots = 65)1   0.8211     3.7709   0.218  0.82822    
## ns(age, knots = 65)2   9.3137     3.1174   2.988  0.00380 ** 
## anyvac1               -1.4179     2.0381  -0.696  0.48875    
## anyav1                -2.7812     2.0921  -1.329  0.18774    
## countryAustralia     -15.2190     4.6322  -3.285  0.00155 ** 
## countryDenmark        -8.5429     4.7058  -1.815  0.07346 .  
## countryOther          -2.2521     4.5698  -0.493  0.62357    
## countryPeru           -5.3021     5.3371  -0.993  0.32369    
## countryThailand      -10.1534     3.7805  -2.686  0.00891 ** 
## countryUK            -10.3387     4.5860  -2.254  0.02709 *  
## countryUSA            -6.3205     4.7691  -1.325  0.18910    
## seasonNH.12.13        -1.5773     4.6609  -0.338  0.73600    
## seasonNH.13.14         1.9665     5.2408   0.375  0.70855    
## seasonNH.14.15         0.9677     4.5742   0.212  0.83303    
## seasonNH.15.16         2.5605     5.0225   0.510  0.61168    
## seasonNH.16.17         4.3694     4.8023   0.910  0.36582    
## seasonSH.10           29.7895     6.3831   4.667 1.31e-05 ***
## seasonSH.11           -1.1338     6.9292  -0.164  0.87046    
## seasonSH.12            9.2825     8.1489   1.139  0.25828    
## seasonSH.13            5.3059     5.8360   0.909  0.36618    
## seasonSH.14           -2.4024     4.7144  -0.510  0.61184    
## seasonSH.15            2.4069     4.9119   0.490  0.62555    
## seasonSH.16            7.6984     5.2718   1.460  0.14838    
## seasonSH.17            8.2961     5.3033   1.564  0.12195    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 37.15256)
## 
##     Null deviance: 6078.0  on 99  degrees of freedom
## Residual deviance: 2786.4  on 75  degrees of freedom
## AIC: 668.52
## 
## Number of Fisher Scoring iterations: 2
## H3N2 fit to age plus imprinting, plus fixed effects of vaccination, av use, underlying conditions, country and season
fit2 = glm(tothospdays ~ ns(age, knots = 65) + p.g2.protection + anyvac + anyav + country + season, data = train.H3N2)
summary(fit2)
## 
## Call:
## glm(formula = tothospdays ~ ns(age, knots = 65) + p.g2.protection + 
##     anyvac + anyav + country + season, data = train.H3N2)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -19.8535   -2.6514   -0.2554    1.6159   20.8254  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            12.384      6.493   1.907  0.06037 .  
## ns(age, knots = 65)1    2.885      5.973   0.483  0.63047    
## ns(age, knots = 65)2   10.374      3.930   2.640  0.01011 *  
## p.g2.protection         1.457      3.257   0.447  0.65599    
## anyvac1                -1.441      2.050  -0.703  0.48426    
## anyav1                 -2.759      2.104  -1.311  0.19385    
## countryAustralia      -15.262      4.658  -3.276  0.00160 ** 
## countryDenmark         -8.501      4.732  -1.796  0.07651 .  
## countryOther           -2.275      4.595  -0.495  0.62199    
## countryPeru            -5.133      5.379  -0.954  0.34308    
## countryThailand       -10.106      3.802  -2.658  0.00963 ** 
## countryUK             -10.502      4.625  -2.271  0.02608 *  
## countryUSA             -6.236      4.798  -1.300  0.19775    
## seasonNH.12.13         -1.864      4.730  -0.394  0.69466    
## seasonNH.13.14          1.902      5.271   0.361  0.71930    
## seasonNH.14.15          1.012      4.600   0.220  0.82640    
## seasonNH.15.16          2.411      5.061   0.476  0.63516    
## seasonNH.16.17          4.340      4.829   0.899  0.37170    
## seasonSH.10            29.653      6.425   4.616 1.61e-05 ***
## seasonSH.11            -1.495      7.013  -0.213  0.83173    
## seasonSH.12             9.395      8.197   1.146  0.25540    
## seasonSH.13             5.089      5.887   0.864  0.39018    
## seasonSH.14            -2.258      4.751  -0.475  0.63597    
## seasonSH.15             2.369      4.939   0.480  0.63290    
## seasonSH.16             7.677      5.300   1.448  0.15171    
## seasonSH.17             8.130      5.345   1.521  0.13250    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 37.55311)
## 
##     Null deviance: 6078.0  on 99  degrees of freedom
## Residual deviance: 2778.9  on 74  degrees of freedom
## AIC: 670.25
## 
## Number of Fisher Scoring iterations: 2
## H3N2 fit to age only, plus fixed effects of vaccination, av use, underlying conditions NO COUNTRY AND SEASON
fit3 = glm(tothospdays ~ ns(age, knots = 65) + anyvac + anyav, data = train.H3N2)
summary(fit3)
## 
## Call:
## glm(formula = tothospdays ~ ns(age, knots = 65) + anyvac + anyav, 
##     data = train.H3N2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -10.214   -2.549   -1.017    0.947   51.500  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             8.588      2.355   3.647 0.000433 ***
## ns(age, knots = 65)1    1.444      4.067   0.355 0.723428    
## ns(age, knots = 65)2   11.944      3.457   3.455 0.000824 ***
## anyvac1                -2.201      2.034  -1.082 0.282012    
## anyav1                 -3.514      2.028  -1.733 0.086392 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 55.24726)
## 
##     Null deviance: 6078.0  on 99  degrees of freedom
## Residual deviance: 5248.5  on 95  degrees of freedom
## AIC: 691.84
## 
## Number of Fisher Scoring iterations: 2
## H3N2 fit to age plus imprinting, plus fixed effects of vaccination, av use, underlying conditions, NO COUNTRY AND SEASON
fit4 = glm(tothospdays ~ ns(age, knots = 65) + p.g2.protection + anyvac + anyav, data = train.H3N2)
summary(fit4)
## 
## Call:
## glm(formula = tothospdays ~ ns(age, knots = 65) + p.g2.protection + 
##     anyvac + anyav, data = train.H3N2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -10.058   -2.446   -0.979    0.852   51.203  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)             6.283      3.904   1.609  0.11091   
## ns(age, knots = 65)1    4.968      6.264   0.793  0.42967   
## ns(age, knots = 65)2   13.715      4.209   3.259  0.00156 **
## p.g2.protection         2.662      3.591   0.741  0.46042   
## anyvac1                -2.184      2.039  -1.071  0.28693   
## anyav1                 -3.419      2.037  -1.679  0.09655 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 55.51057)
## 
##     Null deviance: 6078  on 99  degrees of freedom
## Residual deviance: 5218  on 94  degrees of freedom
## AIC: 693.26
## 
## Number of Fisher Scoring iterations: 2
## Plot the fits for each model side by side
par(mfrow = c(1, 2), las = 3, mar = c(6, 6, 6, 3), cex.axis = .7)
library(gam)
plot.gam(fit00, se = T, col = 'firebrick1', main = 'Simplified Baseline')

plot.gam(fit0, se = T, col = 'firebrick1', main = 'Baseline')

plot.gam(fit1, se = T, col = 'firebrick1', main = 'Baseline + age + imp')

plot.gam(fit2, se = T, col = 'firebrick1', main = 'Baseline + age + imp')

plot.gam(fit3, se = T, col = 'firebrick1', main = 'Simple baseline + age + imp')

plot.gam(fit4, se = T, col = 'firebrick1', main = 'Simple baseline + age + imp')

pdf('003Tothospdays_H3N2.pdf')
plot.gam(fit4, se = T, col = 'firebrick1', main = 'Simple baseline + age + imp')
dev.off()
## quartz_off_screen 
##                 2
# Predict probs for each patient
pred00 = predict(fit00, newdata = test.H3N2)
pred3 = predict(fit3, newdata = test.H3N2)
pred4 = predict(fit4, newdata = test.H3N2)
  
# Estimate the simulated error rate
MSE = numeric(3); names(MSE) = c('simple.baseline', 'simple.baseline+age', 'simple.baseline+age+imp')
MSE[1] = mean((pred00 - test.H3N2$tothospdays)^2)
MSE[2] = mean((pred3 - test.H3N2$tothospdays)^2)
MSE[3] = mean((pred4 - test.H3N2$tothospdays)^2)
sort(MSE)
##         simple.baseline simple.baseline+age+imp     simple.baseline+age 
##                81.01654                83.27925                83.35676
AIC = c(fit00$aic, fit0$aic, fit1$aic, fit2$aic, fit3$aic, fit4$aic); names(AIC) =  c('simple.baseline', 'baseline', 'baseline+age', 'baseline+age+imp', 'simple.baseline+age', 'simple.baseline+age+imp')
del.AIC = sort(AIC - min(AIC))
del.AIC
##            baseline+age        baseline+age+imp                baseline 
##                0.000000                1.730035                7.384847 
##     simple.baseline+age simple.baseline+age+imp         simple.baseline 
##               23.317484               24.734733               31.353222
load('master.AIC.table.H3N2.RData')
master.AIC.table['003Tothospdays', c('simple.baseline', 'baseline', 'baseline+age', 'baseline+age+imp', 'simple.baseline+age', 'simple.baseline+age+imp')] = AIC - min(AIC)
save(master.AIC.table, file = 'master.AIC.table.H3N2.RData')
rm(master.AIC.table)